1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.smartcrawler.extractor;
28 import java.net.URLDecoder;
29 import org.apache.commons.lang.StringUtils;
30 import org.apache.log4j.Logger;
31 import org.smartcrawler.common.SCLogger;
32
33
34 /***
35 *
36 *
37 * @author <a href="mailto:pozzad@alice.it">Davide Pozza</a>
38 * @version <tt>$Revision: 1.11 $</tt>
39 */
40 public class HtmlURLImpl implements HtmlURL {
41
42 private String extractedURL;
43 private String cleanedURL;
44
45 private static Logger log = SCLogger.getLogger(HtmlURLImpl.class);
46
47 /***
48 *
49 * @param extractedURL
50 */
51 public HtmlURLImpl(String extractedURL) {
52 this.extractedURL = extractedURL.trim();
53
54 this.cleanedURL = clean();
55 }
56
57 /***
58 *
59 * @return
60 */
61 public boolean isValid() {
62 return !(
63 cleanedURL.equals("..") ||
64 cleanedURL.startsWith("#") ||
65 cleanedURL.toLowerCase().startsWith("javascript:") ||
66 cleanedURL.toLowerCase().startsWith("mailto:")
67 );
68 }
69
70 /***
71 *
72 * @return
73 */
74 public String getCleanedLinkAsString() {
75 return this.cleanedURL;
76 }
77
78 /***
79 *
80 * @return
81 */
82 protected String clean() {
83 log.debug("clean(): BEGIN");
84
85 String res = null;
86 if (extractedURL != null) {
87
88 res = extractedURL.replace("\"", " ").trim();
89 res = res.replace("'", " ").trim();
90
91 if (res.toLowerCase().endsWith("/")) {
92 res = res.substring(0, res.length() - 1);
93 }
94 if (res.toLowerCase().startsWith("./")) {
95 res = res.substring(2, res.length());
96 }
97
98
99 res = StringUtils.replace(res, "&", "&");
100 try {
101 res = URLDecoder.decode(res,"UTF-8");
102 } catch(Exception e){}
103 }
104 log.debug("clean(): " + extractedURL + "->" + res);
105 log.debug("clean(): END");
106 return res;
107 }
108
109 /***
110 *
111 * @return
112 */
113 public int getType() {
114 int type = -1;
115 if (this.cleanedURL.startsWith("/"))
116 type = LINK_ABSOLUTE_URI;
117 else if (this.cleanedURL.toLowerCase().startsWith("http://") ||
118 this.cleanedURL.toLowerCase().startsWith("https://"))
119 type = LINK_ABSOLUTE_URL;
120 else
121 type = LINK_RELATIVE;
122
123 return type;
124 }
125 }